Introduction

Hi, I'm Chandrasekhar, and this is my solution to the Practical Machine Learning (PML) course project.

We first load the data and remove the columns that contain too many missing values. We then examine the correlations between the remaining predictor columns and the `classe` outcome variable.
# Load the training and testing data sets. Empty strings, "NA", and the
# Excel division-error token "#DIV/0!" are all treated as missing values.
dftr <- read.csv("pml-training.csv", stringsAsFactors = FALSE,
                 na.strings = c("", "NA", "#DIV/0!"))
dft <- read.csv("pml-testing.csv", stringsAsFactors = FALSE,
                na.strings = c("", "NA", "#DIV/0!"))

# Hold out 30% of the training rows as a validation set, stratified on the
# outcome `classe` (caret::createDataPartition).
intr <- createDataPartition(dftr$classe, p = 0.7, list = FALSE)
dfv <- dftr[-intr, ]
dftr <- dftr[intr, ]

# Count missing values per column for each sensor group. colSums(is.na(...))
# replaces the repeated sapply(select(...), function(x) sum(is.na(x))) idiom
# and likewise returns a named numeric vector, one entry per matching column.
count_na_by_pattern <- function(pattern) {
  colSums(is.na(dftr[grepl(pattern, names(dftr))]))
}

bm <- count_na_by_pattern("_belt")      # belt sensors
am <- count_na_by_pattern("_arm")       # arm sensors
fm <- count_na_by_pattern("_forearm")   # forearm sensors
dm <- count_na_by_pattern("_dumbbell")  # dumbbell sensors

# Columns to drop: every sensor column that contains at least one NA.
c2d <- c(names(bm[bm != 0]),
         names(am[am != 0]),
         names(fm[fm != 0]),
         names(dm[dm != 0]))


# Drop the high-NA sensor columns plus the bookkeeping columns (row id,
# user name, timestamps, window markers), which carry no predictive signal.
# as_tibble() replaces the deprecated tbl_df() (removed warning), and
# all_of(c2d) makes the external character-vector selection explicit,
# silencing the tidyselect ambiguity message.
dfa <- as_tibble(dftr %>%
                   select(-all_of(c2d),
                          -c(X, user_name, raw_timestamp_part_1,
                             raw_timestamp_part_2, cvtd_timestamp,
                             new_window, num_window)))

# The outcome must be a factor for the classification models below.
dfa$classe <- as.factor(dfa$classe)
# The remaining 52 predictors are all sensor readings; force numeric type.
dfa[,1:52] <- lapply(dfa[,1:52],as.numeric)
# Pairwise correlations among the predictors; zero out the diagonal so a
# variable is never flagged as correlated with itself.
cc <- cor(select(dfa, -classe))
diag(cc) <- 0
# Keep the names of every variable involved in at least one |r| > 0.8 pair.
cc <- which(abs(cc) > 0.8, arr.ind = TRUE)
cc <- unique(row.names(cc))

# Correlation plot of the highly-correlated subset. all_of(cc) makes the
# external-vector selection explicit (silences the tidyselect message).
corrplot(cor(select(dfa, all_of(cc))),
         type = "lower", order = "hclust", method = "number")

# Discretize every predictor into 4 bins (correlationfunnel::binarize),
# lumping levels rarer than 1%, then check which binned features correlate
# with each outcome class. One interactive funnel plot per class A-D
# (class E is implied by the other four).
cfd <- dfa %>% binarize(n_bins = 4, thresh_infreq = 0.01)
coa <- cfd %>% correlate(target = classe__A) 
coa %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
cob <- cfd %>% correlate(target = classe__B)
cob %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
coc <- cfd %>% correlate(target = classe__C)
coc %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
cod <- cfd %>% correlate(target = classe__D)
cod %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))

Let us plot the pairwise relationships among the selected predictors.

# Top predictors read off the correlation-funnel plots, one vector per
# outcome class (acol = class A, ..., ecol = class E).
acol <- c("magnet_arm_x", "pitch_forearm" , "magnet_dumbbell_y", 
           "roll_forearm", "gyros_dumbbell_y") 
bcol <- c("magnet_dumbbell_y", "magnet_dumbbell_x" , "roll_dumbbell" , 
           "magnet_belt_y" , "accel_dumbbell_x" )
ccol <- c("magnet_dumbbell_y", "roll_dumbbell" , "accel_dumbbell_y" , 
           "magnet_dumbbell_x", "magnet_dumbbell_z")
dcol <- c("pitch_forearm" , "magnet_arm_y" , "magnet_forearm_x",
           "accel_dumbbell_y", "accel_forearm_x")
ecol <- c("magnet_belt_y" , "magnet_belt_z" , "roll_belt", 
           "gyros_belt_z" , "magnet_dumbbell_y")

# Final candidate feature set: the distinct names across all five vectors.
# unique(c(...)) replaces the original element-by-element union() loop; it
# preserves first-occurrence order, so the column indices used downstream
# (1:17, 1:5, 6:10, 11:17) are unchanged.
fic <- unique(c(acol, bcol, ccol, dcol, ecol))

# Subset to the selected features plus the outcome. all_of(fic) makes the
# external-vector selection explicit (silences the tidyselect message).
dfa_2 <- dfa %>% select(all_of(fic), classe)

# How many of the selected features come from each sensor location.
data.frame("arm" = sum(grepl("_arm", fic)),
           "forearm" = sum(grepl("_forearm", fic)),
           "belt" = sum(grepl("_belt", fic)),
           "dumbbell" = sum(grepl("_dumbbell", fic)))
# Custom GGally diagonal panel: per-class density curves (alpha-blended)
# with the ColorBrewer "Set2" fill palette.
demy <- function(data, mapping, ...) {
  base_plot <- ggplot(data = data, mapping = mapping)
  base_plot +
    geom_density(..., alpha = 0.3) +
    scale_fill_brewer(palette = "Set2")
}

# Custom GGally lower panel: heavily transparent scatter points with the
# ColorBrewer "Set2" fill palette.
pomy <- function(data, mapping, ...) {
  base_plot <- ggplot(data = data, mapping = mapping)
  base_plot +
    geom_point(..., alpha = 0.1) +
    scale_fill_brewer(palette = "Set2")
}

# Pairs plots of the 17 selected predictors, split into three panels
# (columns 1:5, 6:10, 11:17), colored by class, using the custom density
# diagonal (demy) and scatter lower panel (pomy) defined above.
ggpairs(dfa_2, columns = 1:5,aes(color = classe),
        lower = list(continuous = pomy),diag = list(continuous = demy))

ggpairs(dfa_2, columns = 6:10,aes(color = classe),
        lower = list(continuous = pomy),diag = list(continuous = demy))

ggpairs(dfa_2, columns = 11:17,aes(color = classe),
        lower = list(continuous = pomy),diag = list(continuous = demy))

# Build modelling frames from the raw training/validation splits using the
# selected features. all_of(fic) makes the external-vector selection
# explicit (silences the tidyselect message).
dftrF <- dftr %>% select(all_of(fic), classe)
dfvF <- dfv %>% select(all_of(fic), classe)

# Force the 17 predictor columns to numeric. lapply keeps them as
# data-frame columns (sapply would build an intermediate matrix).
dftrF[, 1:17] <- lapply(dftrF[, 1:17], as.numeric)
dfvF[, 1:17] <- lapply(dfvF[, 1:17], as.numeric)

lvs <- c("A", "B", "C", "D", "E")

# Center/scale/Box-Cox transform learned on the training predictors only
# (column 18 is classe), then applied to both splits to avoid leaking
# validation information into the preprocessing.
ppo <- preProcess(dftrF[, -18], method = c("center", "scale", "BoxCox"))
xt <- predict(ppo, select(dftrF, -classe))
yt <- factor(dftrF$classe, levels = lvs)
xv <- predict(ppo, select(dfvF, -classe))
yv <- factor(dfvF$classe, levels = lvs)

# 5-fold cross-validation for the caret::train fits below.
ctrlt <- trainControl(method = "cv", number = 5)


# Classification tree (rpart) baseline, tuned via 5-fold CV.
CTm <- train(x = xt,y = yt, 
                 method = "rpart", trControl = ctrlt)


# Random forest, selecting mtry by CV accuracy.
RFm <- train(x = xt,y = yt, 
                 method = "rf", trControl = ctrlt,verbose=FALSE, metric = "Accuracy")


# Gradient boosted trees (gbm), tuned via 5-fold CV.
GBMm <- train(x = xt,y = yt, 
                  method = "gbm",trControl=ctrlt, verbose=FALSE)


# Support vector machine with a polynomial kernel (e1071::svm); fit
# directly rather than through caret, with a fixed cost of 10.
SVMm <- svm(x = xt,y = yt,
                kernel = "polynomial", cost = 10)
# Out-of-sample performance of the classification tree on the validation set.
confusionMatrix(predict(CTm,xv),yv)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1515  473  486  420  144
##          B   30  383   36  179  161
##          C  128  283  504  365  291
##          D    0    0    0    0    0
##          E    1    0    0    0  486
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4907          
##                  95% CI : (0.4779, 0.5036)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3347          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9050  0.33626  0.49123   0.0000  0.44917
## Specificity            0.6383  0.91445  0.78041   1.0000  0.99979
## Pos Pred Value         0.4987  0.48542  0.32081      NaN  0.99795
## Neg Pred Value         0.9442  0.85165  0.87900   0.8362  0.88959
## Prevalence             0.2845  0.19354  0.17434   0.1638  0.18386
## Detection Rate         0.2574  0.06508  0.08564   0.0000  0.08258
## Detection Prevalence   0.5162  0.13407  0.26695   0.0000  0.08275
## Balanced Accuracy      0.7717  0.62536  0.63582   0.5000  0.72448
# Out-of-sample performance of the random forest on the validation set.
confusionMatrix(predict(RFm,xv),yv)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1669   13    0    2    0
##          B    5 1107   21    2    0
##          C    0   16  999   26    0
##          D    0    3    6  934    3
##          E    0    0    0    0 1079
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9835          
##                  95% CI : (0.9799, 0.9866)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9791          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9970   0.9719   0.9737   0.9689   0.9972
## Specificity            0.9964   0.9941   0.9914   0.9976   1.0000
## Pos Pred Value         0.9911   0.9753   0.9597   0.9873   1.0000
## Neg Pred Value         0.9988   0.9933   0.9944   0.9939   0.9994
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2836   0.1881   0.1698   0.1587   0.1833
## Detection Prevalence   0.2862   0.1929   0.1769   0.1607   0.1833
## Balanced Accuracy      0.9967   0.9830   0.9825   0.9832   0.9986
# Error curves of the final random forest as trees are added.
plot(RFm$finalModel,main="Error Graph")

# Out-of-sample performance of the boosted trees on the validation set.
confusionMatrix(predict(GBMm,xv),yv)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1635   49    0    3    4
##          B   22  972   50   25   30
##          C   11   65  942   64   22
##          D    3   52   33  864   18
##          E    3    1    1    8 1008
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9212         
##                  95% CI : (0.914, 0.9279)
##     No Information Rate : 0.2845         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9003         
##                                          
##  Mcnemar's Test P-Value : 1.428e-15      
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9767   0.8534   0.9181   0.8963   0.9316
## Specificity            0.9867   0.9732   0.9667   0.9785   0.9973
## Pos Pred Value         0.9669   0.8844   0.8533   0.8907   0.9873
## Neg Pred Value         0.9907   0.9651   0.9824   0.9797   0.9848
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2778   0.1652   0.1601   0.1468   0.1713
## Detection Prevalence   0.2873   0.1867   0.1876   0.1648   0.1735
## Balanced Accuracy      0.9817   0.9133   0.9424   0.9374   0.9645
# Out-of-sample performance of the polynomial SVM on the validation set.
confusionMatrix(predict(SVMm,xv),yv)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1652   65   22   17    2
##          B    6 1005   28    4   10
##          C   10   60  960   68   10
##          D    6    8   12  872   30
##          E    0    1    4    3 1030
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9378          
##                  95% CI : (0.9313, 0.9438)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9212          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9869   0.8824   0.9357   0.9046   0.9519
## Specificity            0.9748   0.9899   0.9695   0.9886   0.9983
## Pos Pred Value         0.9397   0.9544   0.8664   0.9397   0.9923
## Neg Pred Value         0.9947   0.9723   0.9862   0.9814   0.9893
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2807   0.1708   0.1631   0.1482   0.1750
## Detection Prevalence   0.2987   0.1789   0.1883   0.1577   0.1764
## Balanced Accuracy      0.9808   0.9361   0.9526   0.9466   0.9751
# Score the graded test cases with the three strongest models. all_of(fic)
# makes the external-vector selections explicit (silences the tidyselect
# message).
newdft <- dft %>% select(all_of(fic), problem_id)

xTest <- newdft %>% select(all_of(fic))

# One row per test case: its id plus each model's predicted class.
finalr <- data.frame("problem_id" = dft$problem_id,
                     "RF" = predict(RFm, xTest),
                     "GBM" = predict(GBMm, xTest),
                     "SVM" = predict(SVMm, xTest))

finalr

Thank you for going through my project. Have a great day!!!